In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
datadir = 'data/'
In [2]:
college_financials = pd.read_csv(datadir + 'college_financials.csv', header=0)
college_financials = college_financials[(college_financials['sector'] != 0) & (college_financials['sector'] != 99)]
print('college_financials has {} records'.format(len(college_financials)))
college_financials has 86722 records
In [3]:
college_financials.describe().transpose()
Out[3]:
count mean std min 25% 50% 75% max
academic_year 86722.0 2.005665e+03 3.876564e+00 2.000000e+03 2.002000e+03 2.006000e+03 2.009000e+03 2.012000e+03
sector 86722.0 4.903888e+00 2.895528e+00 1.000000e+00 2.000000e+00 4.000000e+00 8.000000e+00 9.000000e+00
cpi_index 86722.0 1.966440e+02 1.905778e+01 1.693000e+02 1.782000e+02 1.990000e+02 2.146580e+02 2.275650e+02
cpi_scalar_2012 86722.0 8.641223e-01 8.374656e-02 7.439633e-01 7.830730e-01 8.744754e-01 9.432821e-01 1.000000e+00
hepi_index 86722.0 2.465199e+02 3.267799e+01 1.969000e+02 2.127000e+02 2.531000e+02 2.793000e+02 2.932000e+02
hepi_scalar_2012 86722.0 8.407910e-01 1.114529e-01 6.715553e-01 7.254434e-01 8.632333e-01 9.525921e-01 1.000000e+00
heca_index 86722.0 8.592626e+01 9.509068e+00 7.152066e+01 7.663315e+01 8.733172e+01 9.472764e+01 1.000000e+02
heca_scalar_2012 86722.0 8.592626e-01 9.509068e-02 7.152066e-01 7.663315e-01 8.733172e-01 9.472764e-01 1.000000e+00
gross_tuition_fees_rev 74502.0 2.336773e+07 7.285970e+07 1.000000e+01 9.532058e+05 4.499148e+06 1.781273e+07 3.552224e+09
net_tuition_fees_rev 74560.0 1.889903e+07 6.016779e+07 -2.736279e+09 8.673130e+05 4.037190e+06 1.483597e+07 3.269544e+09
rev_fed_grant 39661.0 1.619542e+07 6.574901e+07 1.000000e+00 4.094300e+05 2.031638e+06 7.572422e+06 1.760281e+09
rev_state_grant 30723.0 4.020453e+06 1.213434e+07 -8.220000e+02 2.009990e+05 8.215890e+05 3.144227e+06 3.295267e+08
rev_local_grant 17024.0 6.006923e+06 2.928161e+07 6.000000e+00 8.275325e+04 3.937420e+05 1.678190e+06 8.695333e+08
rev_state_local_grant_contract 31614.0 7.141843e+06 3.081124e+07 1.000000e+01 2.442330e+05 9.837210e+05 3.962840e+06 1.113421e+09
rev_fedapp_grant_contract 48541.0 1.418493e+07 6.152977e+07 1.000000e+00 3.189880e+05 1.472838e+06 6.224000e+06 1.760281e+09
investment 44468.0 6.135498e+06 1.269109e+08 -1.260496e+10 5.266750e+03 1.079425e+05 9.973115e+05 7.821743e+09
total_rev 74758.0 6.712868e+07 3.029614e+08 -1.011723e+10 1.700656e+06 9.159115e+06 3.644522e+07 1.029353e+10
grant01 69660.0 3.095791e+06 1.086538e+07 1.000000e+01 2.558220e+05 9.253975e+05 2.769794e+06 9.854184e+08
grant02 52969.0 7.039628e+05 3.028498e+06 1.000000e+00 4.554900e+04 1.578330e+05 4.750750e+05 1.800000e+08
grant03 45542.0 1.739505e+06 7.085529e+06 -7.785000e+03 6.247650e+04 3.496704e+05 1.465508e+06 3.650848e+08
grant04 4790.0 4.513565e+05 2.036737e+06 3.000000e+00 2.105075e+04 7.945203e+04 2.946475e+05 5.372136e+07
grant05 30363.0 2.441675e+06 9.455768e+06 1.000000e+00 9.814550e+04 4.179560e+05 1.430480e+06 3.161390e+08
grant06 30660.0 8.243258e+06 2.502514e+07 1.000000e+00 2.594512e+05 1.664286e+06 7.849699e+06 2.772758e+09
grant07 73466.0 9.056755e+06 2.828899e+07 1.000000e+00 3.485835e+05 1.826952e+06 7.864178e+06 2.773045e+09
institutional_grant_aid 47111.0 7.038948e+06 2.439117e+07 -2.141000e+03 7.655400e+04 6.999060e+05 5.226228e+06 2.772803e+09
institutional_grant_aid_share 47111.0 3.732092e-01 3.486803e-01 -1.929339e-03 5.207892e-02 2.507701e-01 6.986465e-01 1.065471e+01
tuition_discount 47000.0 1.891978e-01 9.691334e-01 -3.954510e-04 2.180133e-02 9.832919e-02 2.585874e-01 1.219512e+02
any_aid_pct 68169.0 8.143444e+01 6.365398e+01 0.000000e+00 7.100000e+01 8.700000e+01 9.600000e+01 1.168000e+04
fed_grant_pct 68154.0 5.246191e+01 3.867826e+01 0.000000e+00 3.100000e+01 5.000000e+01 7.300000e+01 4.080000e+03
fed_grant_avg_amount 61345.0 3.253228e+03 1.684984e+03 0.000000e+00 2.450000e+03 3.100000e+03 4.007000e+03 2.141440e+05
state_grant_num 68643.0 1.460436e+02 4.630185e+02 0.000000e+00 0.000000e+00 2.300000e+01 1.350000e+02 2.357000e+04
state_grant_pct 68109.0 2.245316e+01 3.113760e+01 0.000000e+00 0.000000e+00 1.300000e+01 3.800000e+01 3.880000e+03
inst_grant_pct 68116.0 2.657765e+01 3.374649e+01 0.000000e+00 0.000000e+00 9.000000e+00 4.600000e+01 2.800000e+02
inst_grant_avg_amount 42378.0 3.866184e+03 4.800312e+03 0.000000e+00 9.050000e+02 1.897000e+03 5.039750e+03 4.882500e+04
loan_pct 68133.0 5.069516e+01 4.525168e+01 0.000000e+00 1.800000e+01 5.700000e+01 7.800000e+01 3.460000e+03
loan_avg_amount 52985.0 4.901860e+03 2.735865e+03 0.000000e+00 3.018000e+03 4.384000e+03 6.342000e+03 9.794000e+04
tuition01_tf 48198.0 9.599122e+03 8.464764e+03 0.000000e+00 2.490000e+03 7.400000e+03 1.443900e+04 7.002400e+04
tuition02_tf 48201.0 9.695760e+03 8.393342e+03 0.000000e+00 2.790000e+03 7.440000e+03 1.443600e+04 7.002400e+04
tuition03_tf 48210.0 1.140903e+04 7.626917e+03 0.000000e+00 5.796000e+03 9.660000e+03 1.529600e+04 7.002400e+04
tuition05_tf 21600.0 1.012317e+04 7.519796e+03 0.000000e+00 4.824000e+03 8.304000e+03 1.323000e+04 5.445000e+04
tuition06_tf 21601.0 1.012623e+04 7.518132e+03 0.000000e+00 4.835000e+03 8.316000e+03 1.323000e+04 5.445000e+04
tuition07_tf 21607.0 1.193506e+04 7.054273e+03 0.000000e+00 7.200000e+03 1.044000e+04 1.500000e+04 5.445000e+04
total_exp 70108.0 6.172147e+07 2.727533e+08 1.000000e+00 1.543398e+06 7.527832e+06 3.304988e+07 9.974033e+09
total_eandg 74637.0 4.778788e+07 1.863159e+08 1.000000e+00 1.289342e+06 7.030557e+06 2.948296e+07 7.170057e+09
total_eandg_adj 74626.0 4.776531e+07 1.860493e+08 1.000000e+00 1.323640e+06 7.217064e+06 2.954793e+07 7.170057e+09
eandr 74470.0 3.424916e+07 1.119530e+08 1.000000e+00 1.287706e+06 6.797690e+06 2.663710e+07 4.350766e+09
eandr_degree 48789.0 9.790666e+04 3.331715e+05 6.285714e+00 4.191248e+04 5.909957e+04 8.789800e+04 2.467900e+07
average_subsidy 74818.0 1.525600e+07 6.982168e+07 -8.536687e+08 -1.156850e+05 8.869330e+05 9.776385e+06 2.972109e+09
gross_operating_margin 74793.0 9.242000e+06 1.186528e+08 -1.387330e+10 -2.381600e+04 2.406170e+05 2.586934e+06 6.989228e+09
total_assets 57866.0 1.821037e+08 1.163893e+09 -3.721908e+06 1.690362e+06 1.844450e+07 8.651108e+07 6.410306e+10
total_liabilities 57464.0 5.870972e+07 4.087362e+08 -2.695640e+05 6.041828e+05 4.029960e+06 2.667115e+07 3.992397e+10
total_net_assets 37104.0 1.905056e+08 1.007985e+09 -4.786206e+08 1.037543e+07 3.442604e+07 9.916914e+07 4.416421e+10
total_faculty_all 40648.0 4.024933e+02 1.073228e+03 0.000000e+00 3.100000e+01 1.150000e+02 3.270000e+02 2.800100e+04
all_employees 40722.0 8.511828e+02 2.467250e+03 0.000000e+00 6.300000e+01 2.440000e+02 6.470000e+02 7.216200e+04
ft_faculty_salary 44297.0 5.015538e+04 1.832936e+04 0.000000e+00 3.850370e+04 4.841380e+04 5.973557e+04 2.473920e+05
In [7]:
for c in college_financials.columns:
    df_temp = college_financials.copy()[college_financials[c].notnull()]
    need_log = False
    if df_temp[c].dtype in (object, np.int64):
        continue
    n = 0
    if (np.abs(df_temp[c].max() / df_temp[c].mean()) > 3):
        df_temp[c] = df_temp[c].apply(lambda x: np.sign(x) * np.log10(np.abs(x)) if not x == 0 else 0)
        n += 1
    plt.subplots(figsize=(16,8))
    sns.violinplot(x = "sector", y = c, data = df_temp, dropna=True)
    plt.title(c + ('(log {} times)'.format(n) if n > 0 else '') + 'datacount: ' + str(len(df_temp)/len(college_financials)))
    plt.savefig('exploration/college_financials/' + c + '.pdf')
C:\ProgramData\Miniconda3\lib\site-packages\matplotlib\pyplot.py:523: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`).
  max_open_warning, RuntimeWarning)

After initial examination of the data, it looks like overall quality of the data is inferior in sectors 0 and 99 where there are duplicates (in the case of 2010 data for Aveda Institute Chapel Hill) and the overall count is a very small percentage of all the data. so we are going to remove all sectors in 0 and 99

In [8]:
pd.value_counts(college_financials['sector']).sort_index().plot.bar()
plt.title('count by sector')
Out[8]:
Text(0.5,1,'count by sector')
In [9]:
pd.value_counts(college_financials['academic_year']).sort_index().plot.bar()
plt.title('count by academic_year')
Out[9]:
Text(0.5,1,'count by academic_year')
In [10]:
year_dict = {1:4, 2:4, 3:4, 4:2, 5:2, 6:2, 7:1, 8:1, 9:1, 0:0, 99:0}
college_financials['years'] = college_financials['sector'].apply(lambda x: year_dict[x])
In [11]:
for c in college_financials.columns:
    df_temp = college_financials.copy()[college_financials[c].notnull()]
    need_log = False
    if df_temp[c].dtype in (object, np.int64):
        continue
    n = 0
    if (np.abs(df_temp[c].max() / df_temp[c].mean()) > 3):
        df_temp[c] = df_temp[c].apply(lambda x: np.sign(x) * np.log10(np.abs(x)) if not x == 0 else 0)
        n += 1
    plt.subplots(figsize=(16,8))
    sns.violinplot(x = "years", y = c, data = df_temp, dropna=True)
    plt.title(c + ('(log {} times)'.format(n) if n > 0 else '') + 'datacount: ' + str(len(df_temp)/len(college_financials)))
    plt.savefig('exploration/college_financials/' + c + '.pdf')
C:\ProgramData\Miniconda3\lib\site-packages\matplotlib\pyplot.py:523: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`).
  max_open_warning, RuntimeWarning)
In [55]:
len(college_financials)
Out[55]:
87560